import os

from transformers import AutoTokenizer, Qwen2Tokenizer

# Initialize the tokenizer from a local model directory
model_dir = os.path.join('D:', os.path.sep, 'ModelSpace', 'Qwen2.5', 'Qwen2.5-1.5B-Instruct')
tokenizer = Qwen2Tokenizer.from_pretrained(
    model_dir,
    local_files_only=True,
)
print(tokenizer)
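
# AutoTokenizer (imported above) is an alternative entry point: it reads
# tokenizer_config.json and resolves to the matching class, typically the
# fast Qwen2TokenizerFast. A minimal sketch of the equivalent call:
auto_tokenizer = AutoTokenizer.from_pretrained(model_dir, local_files_only=True)
print(type(auto_tokenizer))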

# Save the tokenizer (writes vocab.json, merges.txt, tokenizer_config.json, etc.)
save_dir = os.path.join('D:', os.path.sep, 'ModelSpace', 'Qwen2.5', 'Qwen2.5-1.5B-Instruct-COPY')
tokenizer.save_pretrained(save_dir)
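
# Quick check (a minimal sketch): the saved copy is a self-contained
# tokenizer directory and loads back exactly like the original.
tokenizer_copy = Qwen2Tokenizer.from_pretrained(save_dir, local_files_only=True)
print(tokenizer_copy)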


# Tokenize: split the input text into subword token strings
text = 'Transformers分词：台风又双叒叕来了！'
tokens = tokenizer.tokenize(text)
print(tokens)

# Map each token string to its id in the vocabulary
ids = tokenizer.convert_tokens_to_ids(tokens)
print(ids)
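
# convert_ids_to_tokens() is the inverse mapping, recovering the token strings
print(tokenizer.convert_ids_to_tokens(ids))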

# encode() combines tokenize() and convert_tokens_to_ids() in a single call
token_ids = tokenizer.encode(text)
print(token_ids)
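
# decode() reverses the whole pipeline; Qwen2's byte-level BPE round-trips
# the original string, including the rare CJK characters.
print(tokenizer.decode(token_ids))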
